Raw data available at GSE166376
Abstract
Background Environmental insults that activate the maternal immune system are potent primers of developmental neuropathology and maternal immune activation (MIA) has emerged as a risk factor for neurodevelopmental and psychiatric disorders. Animal models of MIA provide an opportunity to identify molecular pathways that initiate disease processes and lead to neuropathology and behavioral deficits in offspring. MIA-induced behaviors are accompanied by anatomical and neurochemical alterations in adult offspring that parallel those seen in affected human populations.
Methods We performed transcriptional profiling and neuroanatomical characterization in a time course across mouse embryonic cortical development, following MIA via single injection of the viral mimic polyinosinic:polycytidylic acid (polyI:C) at E12.5. Transcriptional changes identified in the cortex of MIA offspring at E17.5 were validated and mapped to cortical neuroanatomy and cell types via protein analysis and immunohistochemistry.
Results MIA induced strong transcriptomic signatures, including induction of genes associated with hypoxia, immune signaling, and angiogenesis. The acute response identified 6h after the MIA insult was followed by changes in proliferation, neuronal and glial differentiation, and cortical lamination that emerged at E14.5 and peaked at E17.5. Decreased numbers of proliferative cell types in germinal zones and alterations in neuronal and glial cell types across cortical lamina were identified in the MIA-exposed cortex.
Conclusions MIA-induced transcriptomic signatures in fetal offspring overlap significantly with perturbations identified in neurodevelopmental disorders (NDDs), and provide novel insights into alterations in molecular and developmental timing processes linking MIA and neuropathology, potentially revealing new targets for development of novel approaches for earlier diagnosis and treatment of these disorders.
# Directory structure
github_dir <- file.path("G:/Shared drives/Nord Lab - Computational Projects/MIA/Canales_et_al_eLIFE_DE_repo/Canales_eLIFE_2021_DE")
setwd(github_dir)
# Global R markdown code chunk options
knitr::opts_chunk$set(message=FALSE,
warning = FALSE,
error=FALSE,
echo=TRUE,
cache=TRUE,
fig.width = 7, fig.height = 6,
fig.align = 'left')
# R packages
library(sva)
library(RUVnormalize)
library(pheatmap)
library(edgeR)
library(GenomicFeatures)
library(mclust)
library(parallel)
library(ggplot2)
library(reshape2)
library(dplyr)
library(data.table)
library(DT)
library(RColorBrewer)
library(knitr)
library(ggrepel)
library(gridExtra)
library(grid)
library(plyr)
library(plotly)
library(Hmisc)
# Reads input files
setwd(github_dir)
exp.data <- read.csv("./Extra_submission files/Gene_counts.csv")
rownames(exp.data) <- exp.data$X
exp.data$X <- NULL
sample.info <- read.csv("./Extra_submission files/metadata.csv")
load("./Extra_submission files/exonic.gene.sizes")
# Exonic gene sizes were calculated as follows:
# Generate mm9 list of exon sizes
# Method description at: https://www.biostars.org/p/83901/
#txdb <- makeTxDbFromGFF("G:/Shared drives/Nord Lab - Personal #Folders/Karol/Genomes/Mus_musculus/UCSC/mm9/Annotation/genes.gtf", #format="gtf")
#exons.list.per.gene <- exonsBy(txdb,by="gene")
# I paralelized this increasing the speed of the process >2x on a 4-core (logical) machine.
# Use mclapply instead of parLapply if you use a Mac.
#cl <- makeCluster(detectCores())
#exonic.gene.sizes <- parallel::parLapply(cl, #exons.list.per.gene,function(x){sum(width(reduce(x)))})
#stopCluster(cl)
sample.info_submission <- read.csv("Extra_submission files/Supplementary Table 5, RNA-seq sample metadata_03_02_2021.csv")
datatable(sample.info_submission,
rownames = FALSE,
filter = list(position = 'top', clear = FALSE, plain = FALSE),
options = list(paging = FALSE, scrollX=T, scrollY="300px", searching = FALSE))
# Qualifies F or M based on the Xist expression
sex.by.rna <- c(ifelse(exp.data["Xist",]>1000,"F","M")) #
Xist_exp <- as.data.frame(reshape2::melt(exp.data["Xist",]))
Xist_exp <- cbind(Xist_exp, sex.by.rna)
Xist_exp <- arrange(Xist_exp, value)
colnames(Xist_exp) <- c("variable", "counts", "sex.by.rna")
ggplot(Xist_exp, aes(x=sex.by.rna, y=counts, colours=sex.by.rna))+
geom_jitter()+
geom_boxplot(alpha=0.2)+
theme_bw()+
labs(title="Xist read counts", x = "", y = "Read counts")+
theme(plot.title = element_text(size = rel(2), hjust=0.5))+
scale_x_discrete(breaks= c("F", "M"), labels=c("Females", "Males"))
Sample sex assignment based on Xist gene read count. Samples with less than 1000 Xist reads per sample are males, and samples with more than 1000 Xist reads are females.
# Calculates RPKM values
# Gene lengths calculated with lapply
gene.lengths <- as.numeric(lapply(1:nrow(exp.data), function(x) FUN= as.numeric(exonic.gene.sizes[rownames(exp.data)[x]])))
rpkm.data <- rpkm(exp.data, gene.length=gene.lengths, log=T, prior.count=.25)
rpkm.data_linear <- rpkm(exp.data, gene.length=gene.lengths, log=F)
rpkm.data_linear_all <- rpkm.data_linear
#write.csv(rpkm.data, file = "GEO_submission #files/rpkm_log2_24015.csv")
#write.csv(rpkm.data_linear, file = "GEO_submission #files/rpkm_linear_24015.csv")
### Removes genes with low expression ###
# Lets plot a histogram of all rpkm values in a dataset.
#dim(rpkm.data) #24015 rows and 74 columns
df <- reshape2::melt(rpkm.data)
df$rpkm_linear <- 2^df$value # This converts the log2 RPKM values back to linear scale
colnames(df) <- c("gene_name", "sample", "rpkm_log2","rpkm_linear")
# Histogram of all rpkm values in a dataset.
ggplot(df, aes(x=rpkm_log2))+
geom_histogram(bins = 1000, color="black")+
theme_bw()+
labs(title="Unfiltered dataset of 24015 genes",
x="log2 RPKM", y = "Read counts")+
theme(plot.title = element_text(hjust = 0.5))
Histograms of RPKM gene expression values before and after excluding genes with very low expression.
# Setting the threshold for minimum rpkm value in at least 2 samples.
threshold = -2
# Nymber of genes after filtering
#sum(reshape2::melt(rowSums(rpkm.data > threshold) >= 2)$value) # 17051
# Let's filter the dataset and plot the histogram distribution
keep <- as.data.frame(reshape2::melt(rowSums(rpkm.data > threshold) >= 2))
keep$gene_name <- rownames(keep)
keep <- filter(keep, value == "TRUE")$gene_name
# NOTE: The set of included genes reported in the manuscript was determined using expression criteria from an expanded set of samples including a condition that was not used for the manuscript. The difference between the gene counts is 17195 in the manuscript from the larger set of samples, compared to 17051 genes that would pass in the set of samples used in the MIA vs. control comparison. Note that there is no difference among DEG passing FDR < 0.05 using either gene set.
original_exp.data <- read.csv("Extra_submission files/Count_gene_set_from_initial_submission.csv")
# length(keep) # 17051
# length(original_exp.data$gene_name) # 17195
keep <- original_exp.data$gene_name
#
datExpr_test <- as.data.frame(rpkm.data)
datExpr_test$gene_name <- rownames(datExpr_test)
# Filters the "keep" genes
datExpr_test <- filter(datExpr_test, gene_name %in% keep)
# Generates a matrix-type data frame
rownames(datExpr_test) <- datExpr_test$gene_name
datExpr_test$gene_name <- NULL
# Overwrites the original datExpr object.
datExpr <- datExpr_test
# Plots the histogram after filtering
df <- reshape2::melt(datExpr)
df$rpkm_linear <- 2^df$value # This converts the log2 RPKM values back to linear scale
colnames(df) <- c("sample", "rpkm_log2","rpkm_linear")
ggplot(df, aes(x=rpkm_log2))+
geom_histogram(bins = 1000, color="black")+
theme_bw()+
labs(title = "Dataset of 17195 genes \n after excluding genes expressed at a very low level", x="log2 RPKM", y = "Read counts")+
theme(plot.title = element_text(hjust = 0.5))
Histograms of RPKM gene expression values before and after excluding genes with very low expression.
# Removing low expression genes. Overwrites exp.data object used for DE analysis ###
#dim(exp.data) #exp.data contains counts, datExpr contains rpkms
exp.data$gene_name <- rownames(exp.data)
exp.data <- filter(exp.data, gene_name %in% rownames(datExpr))
rownames(exp.data) <- exp.data$gene_name
exp.data$gene_name <- NULL
# Recalculates rpkm values
# Gene lengths calculated with lapply
gene.lengths <- as.numeric(lapply(1:nrow(exp.data), function(x) FUN= as.numeric(exonic.gene.sizes[rownames(exp.data)[x]])))
# RPKM calculation
rpkm.data <- rpkm(exp.data, gene.length=gene.lengths, log=T, prior.count=.25)
rpkm.data_linear <- rpkm(exp.data, gene.length=gene.lengths, log=F)
#write.csv(rpkm.data, file = "GEO_submission #files/rpkm_log2_17051.csv")
#write.csv(rpkm.data_linear, file = "GEO_submission #files/rpkm_linear_17051.csv")
setwd(github_dir)
group <- ifelse(sample.info[,"Condition"]=="Saline",1,2)
rRNA <- as.numeric(sample.info$rRNA)
sex.by.rna <- factor(sample.info$sex.by.rna)
dpc <- factor(sample.info$DPC)
response <- factor(sample.info$Response)
lane <- factor(sample.info$Lane)
source("Dimensionality reduction plots.r")
grid.arrange(PCA_plot, PCA_plot_sex, ncol = 2)
Figure 1bc. Principal component analysis (PCA) of RNA-seq data indicates that developmental age accounts for the majority of variance across samples. Age (DPC) or sex are represented as colored symbols in (b - left) and (c - right) respectively, and poly(I:C) or saline treatment indicated by circles or triangles, respectively
# 19.5 was used as a numerical representation of the P0 time point.
test.dpc <- as.factor(sample.info$DPC)
test.sex.by.rna <- as.factor(sample.info$sex_by_rna)
test.response <- as.factor(sample.info$Response)
test.rRNA <- sample.info$rRNA
test.group <- as.factor(sample.info$Condition)
test.lane <- as.factor(sample.info$Lane)
test.data <- exp.data
min.cpm.criteria <- 0.1
min.cpm <- min.cpm.criteria
#qCML expression modeling
y <- DGEList(counts=test.data, group=test.group)
keep <- rowSums(cpm(y)>min.cpm) >=2
y <- y[keep, , keep.lib.sizes=FALSE]
## Perform simple exact test on group
y <- calcNormFactors(y)
y <- estimateCommonDisp(y)
y <- estimateTagwiseDisp(y)
test.pseudo <- y$pseudo.counts
pca.results <- prcomp(test.pseudo)
#PCA dataframe does not contain all samples as the exp.data, so I have to find which datapoints are control and polyic now
control.datapoints <- which(sample.info$Condition == "Saline")
polyic.datapoints <- which(sample.info$Condition == "PolyIC")
train.data <- data.frame(PCA.1=pca.results$rotation[control.datapoints,1],
PCA.2=pca.results$rotation[control.datapoints,2],
PCA.3=pca.results$rotation[control.datapoints,3],
PCA.4=pca.results$rotation[control.datapoints,4],
PCA.5=pca.results$rotation[control.datapoints,5],
rRNA=as.numeric(test.rRNA[control.datapoints]),
sex=sex.by.rna[control.datapoints],
lane=test.lane[control.datapoints],
response=test.response[control.datapoints]
)
predict.data <- data.frame(PCA.1=pca.results$rotation[polyic.datapoints,1],
PCA.2=pca.results$rotation[polyic.datapoints,2],
PCA.3=pca.results$rotation[polyic.datapoints,3],
PCA.4=pca.results$rotation[polyic.datapoints,4],
PCA.5=pca.results$rotation[polyic.datapoints,5],
rRNA=as.numeric(test.rRNA[polyic.datapoints]),
sex=sex.by.rna[polyic.datapoints],
lane=test.lane[polyic.datapoints],
response=test.response[polyic.datapoints]
)
lm.model <- lm(as.numeric(as.character(test.dpc[control.datapoints])) ~
PCA.1 + PCA.2 + PCA.3 + PCA.4 + PCA.5 + sex + lane,
data = train.data
)
# Predicts DPC from the linear model built on train data.
predict.saline <- predict(lm.model, newdata = train.data)
predict.polyic <- predict(lm.model, newdata = predict.data)
# I had some trouble with the original for loop and I replaced it with lapply. I also added the sd values and a plot representing modeled DPC values.
dpc.predict.mean.saline <- lapply(1:4, function(x) FUN=mean(predict.saline[which(as.character(test.dpc[control.datapoints])==unique(test.dpc)[x])]))
dpc.predict.sd.saline <- lapply(1:4, function(x) FUN=sd(predict.saline[which(as.character(test.dpc[control.datapoints])==unique(test.dpc)[x])]))
dpc.predict.mean.polyic <- lapply(1:4, function(x) FUN=mean(predict.polyic[which(as.character(test.dpc[polyic.datapoints])==unique(test.dpc)[x])]))
dpc.predict.sd.polyic <- lapply(1:4, function(x) FUN=sd(predict.polyic[which(as.character(test.dpc[polyic.datapoints])==unique(test.dpc)[x])]))
dpc.predict.mean.saline <- as.numeric(dpc.predict.mean.saline)
dpc.predict.sd.saline <- as.numeric(dpc.predict.sd.saline)
dpc.predict.mean.polyic <- as.numeric(dpc.predict.mean.polyic)
dpc.predict.sd.polyic <- as.numeric(dpc.predict.sd.polyic)
lm.predicted.dpc <- data.frame(dpc.predict.mean.saline, dpc.predict.sd.saline, dpc.predict.mean.polyic, dpc.predict.sd.polyic)
####
df_1 <- data.frame("Actual_DPC" = as.numeric(as.character(test.dpc[control.datapoints])),
"Predicted_DPC" = predict.saline, "Condition" = rep("Saline", length(predict.saline)))
df_2 <- data.frame("Actual_DPC" = as.numeric(as.character(test.dpc[polyic.datapoints])),
"Predicted_DPC" = predict.polyic, "Condition" = rep("Poly(I:C)", length(predict.polyic)))
df_combined <- rbind(df_1, df_2)
mean_data <- group_by(df_combined, Actual_DPC, Condition) %>%
summarise(average_predicted_dpc = mean(Predicted_DPC, na.rm = TRUE),
sd_predicted_dpc = sd(Predicted_DPC, na.rm = TRUE)
)
mean_data <- as.data.frame(mean_data)
j_brew_colors <- brewer.pal(n = 8, name = "Paired")[c(6,2)]
df_combined$Condition <- ifelse(df_combined$Condition == "Saline", "Control", as.character(df_combined$Condition))
mean_data$Condition <- ifelse(mean_data$Condition == "Saline", "Control", as.character(mean_data$Condition))
Fig_4c <- ggplot()+
geom_jitter(data=df_combined, aes(x = Actual_DPC, y = Predicted_DPC, color=Condition), size = 2, height= 0.3, alpha=0.7)+
geom_line(data=mean_data, aes(x = Actual_DPC, y = average_predicted_dpc, color=Condition), size=1.5, alpha=0.7)+
theme_bw()+
scale_color_manual(values=c("Control"=j_brew_colors[2], "Poly(I:C)" = j_brew_colors[1]))+
scale_x_continuous(breaks= c(12.5, 14.5, 17.5, 19.5), labels=c("E12.5", "E14.5", "E17.5", "P0"))+
scale_y_continuous(breaks= c(12.5, 14.5, 17.5, 19.5), labels=c("E12.5", "E14.5", "E17.5", "P0"))+
labs(y = "Predicted DPC", x = "Actual DPC")+
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
Fig_4c
Figure 4c. Temporal modeling of the RNA-seq data suggests acceleration of the neurodevelopmental program in MIA animals at E17.5 (P = 4.5e-06, Student’s t-test) and similar trend at E14.5 (P = 0.07, Student’s t-test). Actual age (X-axis) vs predicted age (Y-axis) calculated by the linear model. Control samples were used for training the model. Lines connect average values, points depict individual samples and are jittered along the X-axis.
#Nicer dataframe
predicted.dpc.df <- data.frame("Real_DPC"=c(12.5, 14.5, 17.5, 19.5), "Predicted_Saline_Mean_DPC"= dpc.predict.mean.saline, "Predicted_Saline_SD_DPC"=dpc.predict.sd.saline, "Predicted_PolyIC_Mean_DPC"= dpc.predict.mean.polyic, "Predicted_PolyIC_SD_DPC"=dpc.predict.sd.polyic)
##### T test ####
t.test.12 <- t.test(predict.saline[which(group[control.datapoints]==1 & as.character(dpc[control.datapoints])=="12.5")], predict.polyic[which(group[polyic.datapoints]==2 & as.character(dpc[polyic.datapoints])=="12.5")])
t.test.14 <- t.test(predict.saline[which(group[control.datapoints]==1 & as.character(dpc[control.datapoints])=="14.5")], predict.polyic[which(group[polyic.datapoints]==2 & as.character(dpc[polyic.datapoints])=="14.5")])
t.test.17 <- t.test(predict.saline[which(group[control.datapoints]==1 & as.character(dpc[control.datapoints])=="17.5")], predict.polyic[which(group[polyic.datapoints]==2 & as.character(dpc[polyic.datapoints])=="17.5")])
t.test.19 <- t.test(predict.saline[which(group[control.datapoints]==1 & as.character(dpc[control.datapoints])=="19.5")], predict.polyic[which(group[polyic.datapoints]==2 & as.character(dpc[polyic.datapoints])=="19.5")])
t.test.modeled.DPC.p.val <- data.frame("p-value E12.5"=t.test.12$p.value, "p-value E14.5"=t.test.14$p.value, "p-value E17.5"=t.test.17$p.value, "p-value E19.5"=t.test.19$p.value)
predicted_DPC_df <- reshape2::melt(data.frame("mean E12.5"=t.test.12$estimate, "mean E14.5"=t.test.14$estimate, "mean E17.5"=t.test.17$estimate, "mean E19.5"=t.test.19$estimate))
t_test_p_value=c(t.test.12$p.value, t.test.14$p.value, t.test.17$p.value, t.test.19$p.value)
specify_decimal <- function(x, k) trimws(format(round(x, k), nsmall=k))
predicted_DPC_df_nice <- data.frame("Real_DPC"= c(12.5, 14.5, 17.5, 19.5), "Predicted_DPC_Saline"=predicted_DPC_df$value[c(1,3,5,7)], "Predicted_DPC_PolyIC"=predicted_DPC_df$value[c(2,4,6,8)], "t_test_p_value"=specify_decimal(t_test_p_value, 8))
knitr::kable(predicted_DPC_df_nice)
| Real_DPC | Predicted_DPC_Saline | Predicted_DPC_PolyIC | t_test_p_value |
|---|---|---|---|
| 12.5 | 12.83713 | 12.69296 | 0.77760404 |
| 14.5 | 14.62900 | 15.61387 | 0.08843077 |
| 17.5 | 16.80196 | 19.37435 | 0.00005209 |
| 19.5 | 19.51513 | 19.59144 | 0.70087733 |
single_timepoint_glm_function<- function(x){
control.datapoints <- intersect(control.datapoints, which(dpc == x))
polyic.datapoints <- intersect(polyic.datapoints, which(dpc == x))
use.cols <- c(control.datapoints, polyic.datapoints)
test.dpc <- dpc[use.cols]
test.sex.by.rna <- sex.by.rna[use.cols]
test.response <- response[use.cols]
test.rRNA <- as.numeric(rRNA)[use.cols]
test.group <- group[use.cols]
test.lane <- as.numeric(lane)[use.cols]
test.data <- exp.data[,use.cols]
min.cpm.criteria <- 0.1
min.cpm <- min.cpm.criteria
design <- model.matrix(~as.factor(test.lane) + as.factor(test.sex.by.rna) + as.numeric(test.group))
y <- DGEList(counts=test.data, group=group[use.cols])
keep <- rowSums(cpm(y)>min.cpm) >=2 #keeps only genes expressed in above min.cpm in at least 2 libraries in each group
y <- y[keep, , keep.lib.sizes=FALSE]
y <- estimateGLMCommonDisp(y,design)
y <- estimateGLMTrendedDisp(y,design)
y <- estimateGLMTagwiseDisp(y,design)
fit <- glmFit(y,design)
lrt <- glmLRT(fit) # Genewise Negative Binomial Generalized Linear Models.
glm.output <- topTags(lrt, n=Inf)
glm.output.full <- glm.output$table
glm.output.full$gene_name <- rownames(glm.output.full)
rownames(glm.output.full) <- NULL
glm.output.full <- glm.output.full[,c(6,1:5)]
glm.output.full
}
E12.5_GLM <- single_timepoint_glm_function(12.5)
E14.5_GLM <- single_timepoint_glm_function(14.5)
E17.5_GLM <- single_timepoint_glm_function(17.5)
E19.5_GLM <- single_timepoint_glm_function(19.5)
E12.5_up_n <- nrow(filter(E12.5_GLM, PValue < 0.05, logFC > 0))
E12.5_down_n <- nrow(filter(E12.5_GLM, PValue < 0.05, logFC < 0))
E14.5_up_n <- nrow(filter(E14.5_GLM, PValue < 0.05, logFC > 0))
E14.5_down_n <- nrow(filter(E14.5_GLM, PValue < 0.05, logFC < 0))
E17.5_up_n <- nrow(filter(E17.5_GLM, PValue < 0.05, logFC > 0))
E17.5_down_n <- nrow(filter(E17.5_GLM, PValue < 0.05, logFC < 0))
E19.5_up_n <- nrow(filter(E19.5_GLM, PValue < 0.05, logFC > 0))
E19.5_down_n <- nrow(filter(E19.5_GLM, PValue < 0.05, logFC < 0))
###
E12.5_up_n_FDR <- nrow(filter(E12.5_GLM, FDR < 0.05, logFC > 0))
E12.5_down_n_FDR <- nrow(filter(E12.5_GLM, FDR < 0.05, logFC < 0))
E14.5_up_n_FDR <- nrow(filter(E14.5_GLM, FDR < 0.05, logFC > 0))
E14.5_down_n_FDR <- nrow(filter(E14.5_GLM, FDR < 0.05, logFC < 0))
E17.5_up_n_FDR <- nrow(filter(E17.5_GLM, FDR < 0.05, logFC > 0))
E17.5_down_n_FDR <- nrow(filter(E17.5_GLM, FDR < 0.05, logFC < 0))
E19.5_up_n_FDR <- nrow(filter(E19.5_GLM, FDR < 0.05, logFC > 0))
E19.5_down_n_FDR <- nrow(filter(E19.5_GLM, FDR < 0.05, logFC < 0))
DE_df_n <- t(data.frame("E12.5" =c(E12.5_up_n, E12.5_down_n, E12.5_up_n_FDR, E12.5_down_n_FDR),
"E14.5" =c(E14.5_up_n, E14.5_down_n, E14.5_up_n_FDR, E14.5_down_n_FDR),
"E17.5" =c(E17.5_up_n, E17.5_down_n, E17.5_up_n_FDR, E17.5_down_n_FDR),
"P0" =c(E19.5_up_n, E19.5_down_n, E19.5_up_n_FDR, E19.5_down_n_FDR)))
colnames(DE_df_n) <- c("Upregulated", "Downregulated", "Upregulated", "Downregulated")
DE_df_n_melted <- reshape2::melt(DE_df_n)
DE_df_n_melted$stat <- c(rep(", P < 0.05", 8), rep(", FDR < 0.05", 8))
DE_df_n_melted$Var2_stat <- paste(DE_df_n_melted$Var2, DE_df_n_melted$stat)
Fig1f <- ggplot(DE_df_n_melted, aes(fill=Var2_stat, group=Var2, x=Var1, y=value))+
geom_bar(position = "dodge", stat="identity", color="black")+
theme_bw()+
#scale_fill_manual(values=brewer.pal(n = 8, name = "Paired")[c(6,6,2,2 )])+
scale_fill_manual(values=c("#1F78B4", "#62a0ca", "#E31A1C", "#eb5e60"))+
theme(legend.title=element_blank())+
labs(title="Number of DE genes with P < 0.05 and FDR < 0.05", x="", y="")+
theme(plot.title = element_text(size = rel(1.2), hjust=0.5))+
geom_text(aes(label=value), position=position_dodge(width=0.9), hjust=-0.2)+
coord_flip()+
scale_x_discrete(limits = rev(levels(DE_df_n_melted$Var1)))+
theme(panel.border = element_blank(),
#legend.key = element_blank(),
axis.ticks = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_text(angle = 90, size = 14, face = "bold",
hjust = 0.5, vjust = -4),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())+
theme(legend.position="bottom")
Fig1f
Figure 1f. Numbers of upregulated and downregulated DE genes at FDR < 0.05 and P < 0.05 thresholds show varying DE gene numbers with a peak of dysregulated genes at E17.5.
To improve the performance of interactive volcano plots, only genes with P < 0.05 are displayed. Tables contain full DE data sets.
volcano_plot_text_2 <- function(x, title) {
x <- dplyr::filter(x, PValue <0.05)
test <- ifelse(x$PValue, "Non significant")
test <- ifelse(x$logFC > 0 & x$PValue <0.05, "PValue < 0.05 & logFC > 0", test)
test <- ifelse(x$logFC < 0 & x$PValue <0.05, "PValue < 0.05 & logFC < 0", test)
test <- ifelse(x$logFC > 0 & x$FDR <0.05, "FDR < 0.05 & logFC > 0", test)
test <- ifelse(x$logFC < 0 & x$FDR <0.05, "FDR < 0.05 & logFC < 0", test)
plotDat <- data.frame(x, Group=test)
plotDat$logFC <- ifelse(plotDat$logFC > 4, 4, plotDat$logFC)
plotDat$logFC <- ifelse(plotDat$logFC < -4, -4, plotDat$logFC)
p <- ggplot(plotDat, aes(x = logFC, y=-log10(PValue), fill=Group, col = Group)) +
geom_point(aes(text=gene_name), size=0.5, pch=21, alpha=0.7, stroke = 0.5)+
theme_light()+
scale_fill_manual(values=c("Non significant"="grey30",
"PValue < 0.05 & logFC > 0"="#eb5e60",
"PValue < 0.05 & logFC < 0"="#62a0ca",
"FDR < 0.05 & logFC > 0" = "#960304",
"FDR < 0.05 & logFC < 0" = "#01538a"))+
scale_color_manual(values=c("Non significant"="grey30",
"PValue < 0.05 & logFC > 0"="#eb5e60",
"PValue < 0.05 & logFC < 0"="#62a0ca",
"FDR < 0.05 & logFC > 0" = "#960304",
"FDR < 0.05 & logFC < 0" = "#01538a"))+
labs(title= title, y="", x="")+
theme(plot.title = element_text(size = rel(2), hjust=0.5))+
theme(legend.text=element_text(size=16))+
theme(axis.text=element_text(size=14))+
theme(legend.title=element_blank())+
theme(axis.text=element_text(size=14))+
theme(axis.title = element_text(size=14, face = "bold"))+
theme(legend.position="bottom")+
coord_cartesian(xlim = c(-4.2, 4.2))+
geom_hline(yintercept = -log10(0.05), linetype=2)+
scale_x_continuous(breaks=c(-4, -2, 0, 2, 4), labels=c("<-4","-2", "0", "2", ">4"))+
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
ggplotly(p) %>%
layout(legend = list(
orientation = "v",
y = -0.1,
font=list(
size=14
)
)
)
}
################# Version capped at different y axis levels ###############
volcano_plot_text_3 <- function(x, title) {
x <- dplyr::filter(x, PValue <0.05)
test <- ifelse(x$PValue, "Non significant")
test <- ifelse(x$logFC > 0 & x$PValue <0.05, "PValue < 0.05 & logFC > 0", test)
test <- ifelse(x$logFC < 0 & x$PValue <0.05, "PValue < 0.05 & logFC < 0", test)
test <- ifelse(x$logFC > 0 & x$FDR <0.05, "FDR < 0.05 & logFC > 0", test)
test <- ifelse(x$logFC < 0 & x$FDR <0.05, "FDR < 0.05 & logFC < 0", test)
plotDat <- data.frame(x, Group=test)
plotDat$logFC <- ifelse(plotDat$logFC > 4, 4, plotDat$logFC)
plotDat$logFC <- ifelse(plotDat$logFC < -4, -4, plotDat$logFC)
plotDat$log10_PValue <- -log10(plotDat$PValue)
plotDat$log10_PValue <- ifelse(plotDat$log10_PValue > 8, 8, plotDat$log10_PValue)
#scale_fill_manual(values=c("#1F78B4" - blue, "#62a0ca" - lighter blue, "#E31A1C" - red, "#eb5e60" - lighter red))+
p <- ggplot(plotDat, aes(x = logFC, y=log10_PValue, fill=Group, col = Group)) +
geom_point(aes(text=gene_name), size=0.5, pch=21, alpha=0.7, stroke = 0.5)+
theme_light()+
scale_fill_manual(values=c("Non significant"="grey30",
"PValue < 0.05 & logFC > 0"="#eb5e60",
"PValue < 0.05 & logFC < 0"="#62a0ca",
"FDR < 0.05 & logFC > 0" = "#960304",
"FDR < 0.05 & logFC < 0" = "#01538a"))+
scale_color_manual(values=c("Non significant"="grey30",
"PValue < 0.05 & logFC > 0"="#eb5e60",
"PValue < 0.05 & logFC < 0"="#62a0ca",
"FDR < 0.05 & logFC > 0" = "#960304",
"FDR < 0.05 & logFC < 0" = "#01538a"))+
labs(title= title, y="", x="")+
theme(plot.title = element_text(size = rel(2), hjust=0.5))+
theme(legend.text=element_text(size=16))+
theme(axis.text=element_text(size=14))+
theme(legend.title=element_blank())+
theme(axis.text=element_text(size=14))+
theme(axis.title = element_text(size=14, face = "bold"))+
theme(legend.position="bottom")+
coord_cartesian(xlim = c(-4.2, 4.2))+
expand_limits(x = c(-4.2, 4.2))+
coord_cartesian(ylim = c(0, 8))+
geom_hline(yintercept = -log10(0.05), linetype=2)+
scale_x_continuous(breaks=c(-4, -2, 0, 2, 4), labels=c("<-4","-2", "0", "2", ">4"))+
scale_y_continuous(breaks=c(0, 2, 4, 6, 8), labels=c("0","2", "4", "6", ">8"))+
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
ggplotly(p) %>%
layout(legend = list(
orientation = "v",
y = -0.1,
font=list(
size=14
)
)
)
}
p <- volcano_plot_text_3(E12.5_GLM, "E12.5")
p
Figure 1e.
datatable(E12.5_GLM, rownames = FALSE, filter="top", options = list(pageLength = 15, scrollX=T))
p <- volcano_plot_text_3(E14.5_GLM, "E14.5")
p
Figure 1e.
datatable(E14.5_GLM, rownames = FALSE, filter="top", options = list(pageLength = 15, scrollX=T))
p <- volcano_plot_text_2(E17.5_GLM, "E17.5")
p
Figure 1e.
datatable(E17.5_GLM, rownames = FALSE, filter="top", options = list(pageLength = 15, scrollX=T))
p <- volcano_plot_text_3(E19.5_GLM, "P0")
p
Figure 1e.
datatable(E19.5_GLM, rownames = FALSE, filter="top", options = list(pageLength = 15, scrollX=T))
#Build dataframe of logFC values
padding <- 0
a <- data.frame("gene_name"=E12.5_GLM$gene_name, "E12.5_logFC"=E12.5_GLM$logFC+padding)
b <- data.frame("gene_name"=E14.5_GLM$gene_name, "E14.5_logFC"=E14.5_GLM$logFC+padding)
c <- data.frame("gene_name"=E17.5_GLM$gene_name, "E17.5_logFC"=E17.5_GLM$logFC+padding)
d <- data.frame("gene_name"=E19.5_GLM$gene_name, "E19.5_logFC"=E19.5_GLM$logFC+padding)
e <- merge(a, b, by="gene_name")
f <- merge(c,d, by="gene_name")
df <- merge(e,f, by="gene_name")
FDR_threshold <- 0.05
logFC_threshold <- 1
a <- dplyr::filter(E12.5_GLM, FDR < FDR_threshold &
logFC > logFC_threshold | logFC < -1*logFC_threshold)$gene_name
b <- dplyr::filter(E14.5_GLM, FDR < FDR_threshold &
logFC > logFC_threshold | logFC < -1*logFC_threshold)$gene_name
c <- dplyr::filter(E17.5_GLM, FDR < FDR_threshold &
logFC > logFC_threshold | logFC < -1*logFC_threshold)$gene_name
d <- dplyr::filter(E19.5_GLM, FDR < FDR_threshold &
logFC > logFC_threshold | logFC < -1*logFC_threshold)$gene_name
DE_genes_logFC_1_20_top <- unique(c(a, b, c, d))
#length(DE_genes_logFC_1_20_top)
#Filter DE data frame
df_1 <- dplyr::filter(df, gene_name %in% DE_genes_logFC_1_20_top)
rownames(df_1) <- df_1$gene_name
df_1$gene_name <- NULL
colnames(df_1) <- c("E12.5", "E14.5", "E17.5", "P0")
rownames(df_1) <- NULL
df_1 <- df_1[,c(1:4)]
paletteLength <- 100
test <- as.matrix(df_1)
myBreaks <- c(seq(min(test), 0, length.out=ceiling(paletteLength/2) + 1),
seq(max(test)/paletteLength, max(test), length.out=floor(paletteLength/2)))
myColor <- colorRampPalette(c("#9E0041", "#fea11d","#fea11d", "white", "#43abea", "#0a48aa", "#13008e"))(paletteLength)
Fig_1d <- pheatmap(as.matrix(df_1),
clustering_distance_rows="correlation",
cluster_rows = T,
cluster_cols = F,
legend = T,
angle_col = 0,
legend_breaks = c(-4, -2, 0, 2, 4, max(df_1)),
legend_labels = c("-4", "-2", "0", "2", "4","log2FC"),
fontsize_row = gene_name_font_size,
color = rev(myColor),
breaks = myBreaks,
main = bquote(atop("Heatmap of DE genes",
"FDR < 0.05 and log" [2] *"FC >1 or <-1")))
Fig_1d
Figure 1d. Heatmap representing relative gene expression changes between control and MIA samples across time-points. Hierarchical clustering by relative fold changes shows stage-specific differential expression signatures (DE genes shown have FDR < 0.05 and log2 fold change (log2FC) > 1 or < -1).
SFARI_genes <- read.csv("./Extra_submission files/SFARI-Gene_genes_01-15-2019release_02-26-2019export.csv")
#head(SFARI_genes)
#We are using only high confidence genes associated with autism
SFARI_genes_filtered <- filter(SFARI_genes, gene.score <3)
require("biomaRt")
#Function translating human to mouse orthologs
humanToMouse <- function(x){
human = useMart("ensembl", dataset = "hsapiens_gene_ensembl")
mouse = useMart("ensembl", dataset = "mmusculus_gene_ensembl")
# comparison between mouse and human, returns mouse gene equivalents
genes = getLDS(attributes = c("hgnc_symbol"), filters = "hgnc_symbol", values = x , mart = human, attributesL = c("mgi_symbol"), martL = mouse, uniqueRows=T)
mousex <- unique(genes[, 2])
return(mousex)
}
#Returning a df matching orthologs
humanToMouse_2 <- function(x){
human = useMart("ensembl", dataset = "hsapiens_gene_ensembl")
mouse = useMart("ensembl", dataset = "mmusculus_gene_ensembl")
# comparison between mouse and human, returns mouse gene equivalents
genes = getLDS(attributes = c("hgnc_symbol"), filters = "hgnc_symbol", values = x , mart = human, attributesL = c("mgi_symbol"), martL = mouse, uniqueRows=T)
mousex <- genes
return(mousex)
}
#Translating orthologs
SFARI_genes_mouse <- humanToMouse_2(SFARI_genes$gene.symbol)
#SFARI_genes_mouse
SFARI_genes_with_ortho <- merge(SFARI_genes, SFARI_genes_mouse, by.x= "gene.symbol", by.y = "HGNC.symbol")
#making sure still have all the 87 SFARI genes with gene.score 1 and 2
x <- filter(SFARI_genes_with_ortho, gene.score %in% c(1,2))[,c(1,9)]
SFARI_df_to_save <- filter(SFARI_genes_with_ortho, gene.score %in% c(1,2))[ ,c(1, 3:9)]
colnames(SFARI_df_to_save)[8] <- "MGI.symbol_ortholog"
#write.csv(SFARI_df_to_save, file="Supplementary Table 1, #High_confidence_SFARI_mouse_orthologs.csv",row.names = F)
#length(unique(x$gene.symbol)) # 87 Human SFARI genes
#length(unique(x$MGI.symbol)) # 89 mouse orthologs; MSNP1AS doesn't have an ortholog; DDX3X has 2 orthologs, SRCAP has 2 orthologs too.
#setdiff(SFARI_genes_filtered$gene.symbol, SFARI_genes_with_ortho$gene.symbol)
#### Circular enrichment plot and permutation test enrichment analysis####
`%notin%` <- Negate(`%in%`)
SFARI_DE_df_annotation <- function(x){
cat_1 <- filter(x, gene_name %in% filter(SFARI_genes_with_ortho, gene.score == 1)$MGI.symbol)
cat_2 <- filter(x, gene_name %in% filter(SFARI_genes_with_ortho, gene.score == 2)$MGI.symbol)
cat_3 <- filter(x, gene_name %in% filter(SFARI_genes_with_ortho, gene.score == 3)$MGI.symbol)
cat_4 <- filter(x, gene_name %in% filter(SFARI_genes_with_ortho, gene.score == 4)$MGI.symbol)
cat_5 <- filter(x, gene_name %in% filter(SFARI_genes_with_ortho, gene.score == 5)$MGI.symbol)
cat_6 <- filter(x, gene_name %in% filter(SFARI_genes_with_ortho, gene.score == 6)$MGI.symbol)
cat_NA <- filter(x, gene_name %in% filter(SFARI_genes_with_ortho, gene.score == "NA")$MGI.symbol)
cat_Non_Ausitm <- filter(x, gene_name %notin% filter(SFARI_genes_with_ortho, gene.score %in% c(1, 2,3,4,5,6, "NA"))$MGI.symbol)
cat_1$SFARI_category <- rep(1, nrow(cat_1))
cat_2$SFARI_category <- rep(2, nrow(cat_2))
cat_3$SFARI_category <- rep(3, nrow(cat_3))
cat_4$SFARI_category <- rep(4, nrow(cat_4))
cat_5$SFARI_category <- rep(5, nrow(cat_5))
cat_6$SFARI_category <- rep(6, nrow(cat_6))
cat_NA$SFARI_category <- rep(NA, nrow(cat_NA))
cat_Non_Ausitm$SFARI_category <- rep("Non_Ausitm", nrow(cat_Non_Ausitm))
test <- rbind(cat_1, cat_2, cat_3, cat_4, cat_5, cat_6, cat_NA, cat_Non_Ausitm)
test$Up_or_Down <- ifelse(test$logFC > 0, "Upregulated", "Downregulated")
test
}
E12.5_GLM_SFARI_cat <- SFARI_DE_df_annotation(E12.5_GLM)
E14.5_GLM_SFARI_cat <- SFARI_DE_df_annotation(E14.5_GLM)
E17.5_GLM_SFARI_cat <- SFARI_DE_df_annotation(E17.5_GLM)
E19.5_GLM_SFARI_cat <- SFARI_DE_df_annotation(E19.5_GLM)
E12.5_GLM_SFARI_cat$DPC <- rep("E12.5", nrow(E12.5_GLM_SFARI_cat))
E14.5_GLM_SFARI_cat$DPC <- rep("E14.5", nrow(E14.5_GLM_SFARI_cat))
E17.5_GLM_SFARI_cat$DPC <- rep("E17.5", nrow(E17.5_GLM_SFARI_cat))
E19.5_GLM_SFARI_cat$DPC <- rep("E19.5", nrow(E19.5_GLM_SFARI_cat))
#Checking the number of SFARI genes with gene.score 1 or 2 at each timepoint
#filter(E12.5_GLM_SFARI_cat, SFARI_category %in% c(1,2)) #82 genes
#filter(E14.5_GLM_SFARI_cat, SFARI_category %in% c(1,2)) #82 genes
#filter(E17.5_GLM_SFARI_cat, SFARI_category %in% c(1,2)) #82 genes
#filter(E19.5_GLM_SFARI_cat, SFARI_category %in% c(1,2)) #82 genes
SFARI_genes_filtered_mouse <- x$MGI.symbol
#Double checking how many of the SFARI orthologs overlap with genes in DE analysis
#length(unique(SFARI_genes_filtered_mouse)) #There are 89 mouse orthologs of gene.score 1 and 2 SFARI genes
#filter(E12.5_GLM, gene_name %in% SFARI_genes_filtered_mouse) #82 genes
#filter(E14.5_GLM, gene_name %in% SFARI_genes_filtered_mouse) #82 genes
#filter(E17.5_GLM, gene_name %in% SFARI_genes_filtered_mouse) #82 genes
#filter(E19.5_GLM, gene_name %in% SFARI_genes_filtered_mouse) #82 genes
#The missing 7 genes are just not sufficiently expressed.
##### Stacked circular bar plot - the one in figure 2 #####
df <- rbind(E12.5_GLM_SFARI_cat, E14.5_GLM_SFARI_cat, E17.5_GLM_SFARI_cat, E19.5_GLM_SFARI_cat)
# Filtering SFARI genes with gene.score 1 and 2
df_test <- filter(df, SFARI_category %in% c(1,2))
#I'm replacing the real values of logFC to 1 and -1 to build a stacked bar plot
df_test$logFC <- ifelse(df_test$logFC > 0, 1, -1)
#Annotates if a gene is significant
df_test$Sig <- as.character(ifelse(df_test$PValue < 0.05, "DE", "Non-DE"))
#A combination fo DPC and significance
df_test$DPC_Sig <- paste(df_test$DPC, df_test$Sig, sep="_")
my_col <- brewer.pal(n = 8, name = "Set1")
#head(df_test)
#Add Up or down
df_test$dir <- ifelse(df_test$logFC > 0, "Up", "Down") # Redundant
df_test$DPC_Sig_dir <- paste(df_test$DPC, df_test$Sig, df_test$dir, sep = "_")
df_test <- arrange(df_test, gene_name)
#Yes, I checked the gene names are the same in the columns
#all(filter(df_test, DPC == "E12.5")$gene_name == filter(df_test, DPC == "E14.5")$gene_name)
#all(filter(df_test, DPC == "E17.5")$gene_name == filter(df_test, DPC == "E19.5")$gene_name)
#all(filter(df_test, DPC == "E12.5")$gene_name == filter(df_test, DPC == "E17.5")$gene_name)
## A better matrix for clustering with 0 of non-DE logFC values
df_test$logFC_sig <- ifelse(df_test$PValue > 0.05, 0, df_test$logFC)
df_matrix <- data.frame("gene_name" = filter(df_test, DPC == "E12.5")$gene_name,
"E12.5" = filter(df_test, DPC == "E12.5")$logFC_sig,
"E14.5" = filter(df_test, DPC == "E14.5")$logFC_sig,
"E17.5" = filter(df_test, DPC == "E17.5")$logFC_sig,
"E19.5" = filter(df_test, DPC == "E19.5")$logFC_sig)
#head(df_matrix)
rownames(df_matrix) <- df_matrix$gene_name
df_matrix$gene_name <- NULL
# Dissimilarity matrix
d <- dist(df_matrix, method = "euclidean")
# Hierarchical clustering using Complete Linkage
hc1 <- hclust(d, method = "ward.D2" )
# Plot dendrogram
#plot(hc1, cex = 0.6, hang = -1)
#head(df_test)
df_test$gene_name <- factor(df_test$gene_name, levels = unique(df_test$gene_name)[hc1$order])
df_test <- arrange(df_test, gene_name)
### Angle of labels
step <- 360/length(unique(df_test$gene_name))
myAng <- seq(0, 360, step)
myAng <- rev(myAng)
myAng <- myAng - 90
myAng <- ifelse(myAng > 90 & myAng < 270, myAng-180, myAng)
#Adding FDR
df_test$DPC_Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == -1 & df_test$DPC == "E12.5", "E12.5_Down_FDR", df_test$DPC_Sig_dir)
df_test$DPC_Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == -1 & df_test$DPC == "E14.5", "E14.5_Down_FDR", df_test$DPC_Sig_dir)
df_test$DPC_Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == -1 & df_test$DPC == "E17.5", "E17.5_Down_FDR", df_test$DPC_Sig_dir)
df_test$DPC_Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == -1 & df_test$DPC == "E19.5", "E19.5_Down_FDR", df_test$DPC_Sig_dir)
df_test$DPC_Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == 1 & df_test$DPC == "E12.5", "E12.5_Up_FDR", df_test$DPC_Sig_dir)
df_test$DPC_Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == 1 & df_test$DPC == "E14.5", "E14.5_Up_FDR", df_test$DPC_Sig_dir)
df_test$DPC_Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == 1 & df_test$DPC == "E17.5", "E17.5_Up_FDR", df_test$DPC_Sig_dir)
df_test$DPC_Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == 1 & df_test$DPC == "E19.5", "E19.5_Up_FDR", df_test$DPC_Sig_dir)
df_test$Sig_dir <- ifelse(df_test$PValue < 0.05 & df_test$logFC == -1, "P_Down", "Non-Sig")
df_test$Sig_dir <- ifelse(df_test$PValue < 0.05 & df_test$logFC == 1, "P_Up", df_test$Sig_dir)
df_test$Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == -1, "P_Down", df_test$Sig_dir)
df_test$Sig_dir <- ifelse(df_test$FDR < 0.05 & df_test$logFC == 1, "P_Up", df_test$Sig_dir)
#####
Fig1g <- ggplot(df_test, aes(x=gene_name, y = abs(logFC), fill=DPC_Sig_dir))+
geom_bar(stat = "identity", alpha = 1, position = "stack")+
annotate("rect", xmin=0, xmax=Inf, ymin=-1, ymax=0, alpha=1, fill="white")+
annotate("rect", xmin=0, xmax=Inf, ymin=0, ymax=1, alpha=0.5, fill="grey10")+
annotate("rect", xmin=0, xmax=Inf, ymin=1, ymax=2, alpha=0.5, fill="grey30")+
annotate("rect", xmin=0, xmax=Inf, ymin=2, ymax=3, alpha=0.5, fill="grey60")+
annotate("rect", xmin=0, xmax=Inf, ymin=3, ymax=4, alpha=0.5, fill="grey90")+
geom_bar(stat = "identity", alpha = 1, position = "stack")+
coord_polar()+
theme_minimal()+
geom_hline(yintercept = 0, size=1, linetype=1)+
geom_hline(yintercept = 1, size=1, linetype=1)+
geom_hline(yintercept = 2, size=1, linetype=1)+
geom_hline(yintercept = 3, size=1, linetype=1)+
geom_hline(yintercept = 4, size=1, linetype=1)+
theme(axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())+
labs(x="", y="")+
theme(legend.position = "none")+
scale_fill_manual(values = c(
"E12.5_DE_Up"= "#eb8889",
"E14.5_DE_Up" = "#eb8889",
"E17.5_DE_Up" = "#eb8889",
"E19.5_DE_Up" = "#eb8889",
"E12.5_Non-DE_Up"= NA,
"E14.5_Non-DE_Up" = NA,
"E17.5_Non-DE_Up" = NA,
"E19.5_Non-DE_Up" = NA,
"E12.5_DE_Down"= "#83aecc",
"E14.5_DE_Down" = "#83aecc",
"E17.5_DE_Down" = "#83aecc",
"E19.5_DE_Down" = "#83aecc",
"E12.5_Non-DE_Down"= NA,
"E14.5_Non-DE_Down" = NA,
"E17.5_Non-DE_Down" = NA,
"E19.5_Non-DE_Down" = NA,
############################
"E12.5_Down_FDR" = my_col[2],
"E14.5_Down_FDR" = my_col[2],
"E17.5_Down_FDR" = my_col[2],
"E19.5_Down_FDR" = my_col[2],
"E12.5_Up_FDR" = my_col[1],
"E14.5_Up_FDR" = my_col[1],
"E17.5_Up_FDR" = my_col[1],
"E19.5_Up_FDR" = my_col[1]
))+
ylim(-1,4)+
theme(axis.text.x = element_text(angle = myAng, size =13, color="black"))+
theme(panel.border = element_blank(),
legend.key = element_blank(),
axis.ticks = element_blank(),
axis.text.y = element_blank(),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank())+
annotate(geom = "text", x = 72, y = 0.5, label = "P0", color = "black",
angle = 50, size =10)+
annotate(geom = "text", x = 72, y = 1.46, label = "E17.5", color = "black",
angle = 50, size =10)+
annotate(geom = "text", x = 72, y = 2.47, label = "E14.5", color = "black",
angle = 50, size =10)+
annotate(geom = "text", x = 72, y = 3.47, label = "E12.5", color = "black",
angle = 50, size =10)
Fig1g
Figure 1g. Intersection of stage-specific DE genes with the 82 SFARI autism-associated mouse gene orthologs that were expressed at measurable levels in the RNA-seq data. E17.5 DE genes (FDR < 0.05) were enriched for SFARI genes (P = 3.9e-04, hypergeometric test). Concentric circles represent developmental time-points. Light red, upregulated DE genes (P < 0.05); dark red, upregulated DE genes (FDR < 0.05); light blue, downregulated DE genes (P < 0.05); dark blue, downregulated DE genes (FDR < 0.05).
##### Permutation test per GOmpers et al. #####
# Code adopted from Alex's analysis
#All genes in the background
#Enrichment fo SFARI genes among DE genes.
permutation_test <- function(x, plot.name){
# x <- E17.5_GLM_module_SFARI_cat
binary.score.reference <- ifelse(x$gene_name %in% filter(x, SFARI_category %in% c(1,2))$gene_name, 1,0)
binary.score.test <- ifelse(filter(x, PValue < 0.05)$gene_name %in% filter(x, PValue <0.05, SFARI_category %in% c(1,2))$gene_name, 1,0)
geneset.perm.test <- function(binary.score.reference, binary.score.test, iterations, plot.name) {
set.seed(1234)
binary.score.reference <- ifelse(is.na(binary.score.reference),0,binary.score.reference)
binary.score.test <- ifelse(is.na(binary.score.test),0,binary.score.test)
count.criteria <- vector(length=iterations)
test.size <- length(binary.score.test)
for (index in 1:iterations) {
count.criteria[index] <- sum(sample(binary.score.reference, test.size, replace = F))
}
x.min <- min(c(count.criteria, sum(binary.score.test))) - 20
if (x.min < 0) {x.min <- 0}
x.max<- max(c(count.criteria, sum(binary.score.test))) + 20
z <- abs(mean(count.criteria) - sum(binary.score.test))/sd(count.criteria)
p <- 2*pnorm(-abs(z))
e <- sum(binary.score.test)/mean(count.criteria)
prop <- sum(binary.score.test)/length(binary.score.test)
bg <- length(binary.score.reference)
hist(count.criteria, col="gray", xlab="Count", ylab="Frequency",
main=paste(plot.name, " \ncount=", sum(binary.score.test), " of ", length(binary.score.test), "; p-value=", format(p, scientific = T, digits = 2), sep=""),
xlim=c(x.min,x.max), cex=.5)
# Original title with more parameters
#main=paste(plot.name, " \ncount=", sum(binary.score.test), " of ", #length(binary.score.test), "; p-value=", format(p, scientific = T, digits = #2), "; FE=", format(e, scientific = F, digits = 2), "; Prop=", format(prop, #scientific = F, digits = 2), "; n(bg)=", bg, sep=""),
abline(v=sum(binary.score.test), lwd=3, col="red")
#return(list(count.criteria,z,p,e,prop,bg,sum(binary.score.test)))
}
geneset.perm.test(binary.score.reference, binary.score.test, 100000, plot.name)
}
par(mfrow=c(2,2), mai=c(0.9,0.9,0.9,0.9))
permutation_test(E12.5_GLM_SFARI_cat, "E12.5")
permutation_test(E14.5_GLM_SFARI_cat, "E14.5")
permutation_test(E17.5_GLM_SFARI_cat, "E17.5")
permutation_test(E19.5_GLM_SFARI_cat, "P0")
Enrichment of autism-associated genes in our DE gene set, based on randomly sampled gene sets (bars) vs. observed number (red line).
P values presented in the manuscript may be slightly different due to unspecified seed for the pseudorandom number generator during project development. These differences are numerically negligible, and do not have any impact on any conclusions of this work.
#Hypergeometric test
SFARI_genes_hypergeometric_test <- function(x){
#Anotations on the right refer to the post in the link
#https://stats.stackexchange.com/questions/16247/calculating-the-probability-of-gene-list-overlap-between-an-rna-seq-and-a-chip-c
#x <- E17.5_GLM_module_SFARI_cat
set.seed(1234)
n <- length(as.character(x$gene_name)) #Total gene population 15k
a <- length(filter(x, PValue < 0.05)$gene_name) #RNA Seq enriched # 3000
b <- length(filter(x, SFARI_category %in% c(1,2))$gene_name) #Enriched by Chip-Seq 400
t <- length(filter(x, PValue < 0.05, SFARI_category %in% c(1,2))$gene_name) #Intersected 100
hypergeometric_test_p_value <- sum(dhyper(t:b, a, n - a, b))
hypergeometric_test_p_value
}
hyper_E12.5 <- SFARI_genes_hypergeometric_test(E12.5_GLM_SFARI_cat)
hyper_E14.5 <- SFARI_genes_hypergeometric_test(E14.5_GLM_SFARI_cat)
hyper_E17.5 <- SFARI_genes_hypergeometric_test(E17.5_GLM_SFARI_cat)
hyper_P0 <- SFARI_genes_hypergeometric_test(E19.5_GLM_SFARI_cat)
df_p_values <- data.frame("Age" = c("E12.5", "E14.5", "E17.5", "P0"),
"Permutation test P" = c("0.54", "0.87", "7.6e-06", "0.23"),
"Hypergeometric test P" = c(
format(hyper_E12.5, scientific = F, digits = 2),
format(hyper_E14.5, scientific = F, digits = 2),
format(hyper_E17.5, scientific = T, digits = 2),
format(hyper_P0, scientific = F, digits = 2)
))
colnames(df_p_values) <- c("Age", "Permutation test P", "Hypergeometric test P")
#df_p_values
knitr::kable(df_p_values,
format = "html", table.attr = "style='width:40%;'", align = "l", position = "left") %>%
kableExtra::kable_styling(position = "left")
| Age | Permutation test P | Hypergeometric test P |
|---|---|---|
| E12.5 | 0.54 | 0.33 |
| E14.5 | 0.87 | 0.63 |
| E17.5 | 7.6e-06 | 4.5e-06 |
| P0 | 0.23 | 0.93 |
###
# x is a gene name
# y is a plot title, in case there is an alternative gene name
rpkm_box_plot <- function(x, y){
rpkm_test <- as.data.frame(reshape2::melt(rpkm.data_linear_all[x,]))
rpkm_test_w_info <- cbind(rpkm_test, sample.info[,"ExperimentalDesign"])
rpkm_test_w_info <- cbind(rpkm_test_w_info, as.character(sample.info[,"DPC"]))
rpkm_test_w_info <- cbind(rpkm_test_w_info, as.character(sample.info[,"Condition"]))
colnames(rpkm_test_w_info) <- c("RPKM", "treatment", "DPC", "Condition")
j_brew_colors <- brewer.pal(n = 8, name = "Paired")[c(6,2)]
ggplot(rpkm_test_w_info, aes(x = DPC, y= RPKM, colour=Condition))+
geom_smooth(formula = y ~ x, method = "loess", se=T, aes(fill=Condition, group=Condition), size = 0.7, alpha=0)+
geom_boxplot(alpha=0, position="identity", size = 0.2)+
geom_jitter(size=2, pch = 19, width = 0.2, alpha = 0.5)+
theme_bw()+
theme(axis.text.x=element_text(angle=50, vjust=0.9, hjust=1, size=14))+
theme(axis.text.y=element_text(size=12))+
theme(axis.title.y=element_text(size=14))+
labs(title= y, x="")+
theme(plot.title = element_text(size = rel(2), hjust=0.5))+
scale_color_manual(values = j_brew_colors)+
scale_x_discrete(breaks =c("12.5", "14.5", "17.5", "19.5"), labels = c("E12.5", "E14.5", "E17.5", "P0"))+
theme(legend.position = "none",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())+
theme(legend.position = "none")
}
genes <- c("Vegfa", "Flt1", "Pdk1", "Ldha", "Bnip3", "Il20rb")
pl <- lapply(genes, function(x) rpkm_box_plot(x,x))
marrangeGrob(pl, nrow=2, ncol=3, top = "",
layout_matrix = rbind(c(1,2,3), c(4,5,6)))
Figure 3d. RPKM expression plots of genes that are associated with the network of acute DE at E12.5.
genes <- c("Mki67", "Eomes", "Pax6", "Sox9", "Tbr1", "Bcl11b", "Satb2", "Cux1")
gene_names <- c("Mki67 (Ki67)", "Eomes (Tbr2)", "Pax6", "Sox9", "Tbr1", "Bcl11b (Ctip2)", "Satb2", "Cux1")
df_genes <- data.frame(genes, gene_names)
pl <- lapply(1:nrow(df_genes), function(x) rpkm_box_plot(df_genes[x, 1], df_genes[x, 2]))
marrangeGrob(pl, nrow=4, ncol=2, top = "",
layout_matrix = rbind(c(1,2), c(3,4), c(5,6), c(7,8)))
Figure 4a. RPKM trajectories of proliferative and cortical lamination markers that were DE and validated at protein level.
genes <- c("Gfap", "Olig2", "Dlx2", "Gad1")
pl <- lapply(genes, function(x) rpkm_box_plot(x,x))
marrangeGrob(pl, nrow=2, ncol=3, top = "",
layout_matrix = rbind(c(1,2), c(3,4)))
Figure 6q. RPKM trajectories of the cell identity markers tested in Figure 6a-k across the developmental time-points of the study show parallel differences in mRNA expression of cell-type markers following MIA induction. Gfap and Olig2 mRNA was significantly upregulated in MIA samples at E17.5. Dlx2 was significantly downregulated at E17.5, with no significant changes observed for Gad1 (encodes GAD67) or for Pdgfra (not shown).
genes <- c("Mki67", "Eomes", "Pax6", "Sox9")
gene_names <- c("Mki67 (Ki67)", "Eomes (Tbr2)", "Pax6", "Sox9")
df_genes <- data.frame(genes, gene_names)
pl <- lapply(1:nrow(df_genes), function(x) rpkm_box_plot(df_genes[x, 1], df_genes[x, 2]))
marrangeGrob(pl, nrow=2, ncol=2, top = "",
layout_matrix = rbind(c(1,2), c(3,4)))
Supplementary Figure 7d-g. RNA-seq RPKM expression trajectories throughout development of Ki67, Tbr2, Pax6, and Sox9.
### Male vs Female DE analysis without lane covariate
single_timepoint_glm_sex <- function(x, y){
#x = "12.5"
#y = "M"
control.datapoints <- which(sample.info$Condition == "Saline")
polyic.datapoints <- which(sample.info$Condition == "PolyIC")
control.datapoints <- intersect(control.datapoints, which(dpc == x & sex.by.rna == y))
polyic.datapoints <- intersect(polyic.datapoints, which(dpc == x & sex.by.rna == y))
use.cols <- c(control.datapoints, polyic.datapoints)
test.sex.by.rna <- sex.by.rna[use.cols]
test.lane <- as.numeric(lane)[use.cols]
test.data <- exp.data[,use.cols]
test.group <- group[use.cols]
#test.group
min.cpm <- 0.1
design <- model.matrix(~as.numeric(test.group))
y <- DGEList(counts=test.data, group = as.numeric(test.group))
keep <- rowSums(cpm(y)>min.cpm) >=2 #keeps only genes expressed in above min.cpm in at least 2 libraries in each group
y <- y[keep, , keep.lib.sizes=FALSE]
y <- estimateGLMCommonDisp(y,design)
y <- estimateGLMTrendedDisp(y,design)
y <- estimateGLMTagwiseDisp(y,design)
fit <- glmFit(y,design)
lrt <- glmLRT(fit) # Genewise Negative Binomial Generalized Linear Models.
glm.output <- topTags(lrt, n=Inf)
glm.output.full <- glm.output$table
glm.output.full$gene_name <- rownames(glm.output.full)
rownames(glm.output.full) <- NULL
glm.output.full <- glm.output.full[,c(6,1:5)]
glm.output.full
}
E12.5_GLM_male <- single_timepoint_glm_sex(12.5, "M")
E14.5_GLM_male <- single_timepoint_glm_sex(14.5, "M")
E17.5_GLM_male <- single_timepoint_glm_sex(17.5, "M")
E19.5_GLM_male <- single_timepoint_glm_sex(19.5, "M")
E12.5_GLM_female <- single_timepoint_glm_sex(12.5, "F")
E14.5_GLM_female <- single_timepoint_glm_sex(14.5, "F")
E17.5_GLM_female <- single_timepoint_glm_sex(17.5, "F")
E19.5_GLM_female <- single_timepoint_glm_sex(19.5, "F")
####################################################################################################
DE_genes_barplot_2 <- function(a,b,c,d, title){
E12.5_up_n <- nrow(filter(a, PValue < 0.05, logFC > 0))
E12.5_down_n <- nrow(filter(a, PValue < 0.05, logFC < 0))
E14.5_up_n <- nrow(filter(b, PValue < 0.05, logFC > 0))
E14.5_down_n <- nrow(filter(b, PValue < 0.05, logFC < 0))
E17.5_up_n <- nrow(filter(c, PValue < 0.05, logFC > 0))
E17.5_down_n <- nrow(filter(c, PValue < 0.05, logFC < 0))
E19.5_up_n <- nrow(filter(d, PValue < 0.05, logFC > 0))
E19.5_down_n <- nrow(filter(d, PValue < 0.05, logFC < 0))
###
E12.5_up_n_FDR <- nrow(filter(a, FDR < 0.05, logFC > 0))
E12.5_down_n_FDR <- nrow(filter(a, FDR < 0.05, logFC < 0))
E14.5_up_n_FDR <- nrow(filter(b, FDR < 0.05, logFC > 0))
E14.5_down_n_FDR <- nrow(filter(b, FDR < 0.05, logFC < 0))
E17.5_up_n_FDR <- nrow(filter(c, FDR < 0.05, logFC > 0))
E17.5_down_n_FDR <- nrow(filter(c, FDR < 0.05, logFC < 0))
E19.5_up_n_FDR <- nrow(filter(d, FDR < 0.05, logFC > 0))
E19.5_down_n_FDR <- nrow(filter(d, FDR < 0.05, logFC < 0))
DE_df_n <- t(data.frame("E12.5" =c(E12.5_up_n, E12.5_down_n, E12.5_up_n_FDR, E12.5_down_n_FDR),
"E14.5" =c(E14.5_up_n, E14.5_down_n, E14.5_up_n_FDR, E14.5_down_n_FDR),
"E17.5" =c(E17.5_up_n, E17.5_down_n, E17.5_up_n_FDR, E17.5_down_n_FDR),
"P0" =c(E19.5_up_n, E19.5_down_n, E19.5_up_n_FDR, E19.5_down_n_FDR)))
colnames(DE_df_n) <- c("Upregulated", "Downregulated", "Upregulated", "Downregulated")
DE_df_n_melted <- reshape2::melt(DE_df_n)
DE_df_n_melted$stat <- c(rep(", P < 0.05", 8), rep(", FDR < 0.05", 8))
DE_df_n_melted$Var2_stat <- paste(DE_df_n_melted$Var2, DE_df_n_melted$stat)
ggplot(DE_df_n_melted, aes(fill=Var2_stat, group=Var2, x=Var1, y=value))+
geom_bar(position = "dodge", stat="identity", color="black")+
theme_bw()+
#scale_fill_manual(values=brewer.pal(n = 8, name = "Paired")[c(6,6,2,2 )])+
scale_fill_manual(values=c("#1F78B4", "#62a0ca", "#E31A1C", "#eb5e60"))+
theme(legend.title=element_blank())+
theme(plot.title = element_text(size = rel(1.4), hjust=0.5))+
geom_text(aes(label=value), position=position_dodge(width=0.9), hjust=-0.2)+
coord_flip()+
scale_x_discrete(limits = rev(levels(DE_df_n_melted$Var1)))+
labs(title= title, x="", y="")+
theme(panel.border = element_blank(),
legend.position = "bottom",
axis.ticks = element_blank(),
axis.text.y = element_text(angle = 90, size = 14, face = "bold",
hjust = 0.5, vjust = -4),
axis.text.x = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
}
DE_males <- DE_genes_barplot_2(E12.5_GLM_male,
E14.5_GLM_male,
E17.5_GLM_male,
E19.5_GLM_male, "Male DE")
DE_females <- DE_genes_barplot_2(E12.5_GLM_female,
E14.5_GLM_female,
E17.5_GLM_female,
E19.5_GLM_female, "Female DE")
grid.arrange(DE_males, DE_females, ncol = 2)
Supplementary Figure 2cd. Males and females show largely concordant DE signatures following MIA. Numbers of upregulated and downregulated DE genes with P < 0.05 and FDR < 0.05, for the DE analysis performed on males and females demonstrate robust DE at E17.5 in both sexes.
single_timepoint_glm_function_no_sex <- function(x){
control.datapoints <- which(sample.info$Condition == "Saline")
polyic.datapoints <- which(sample.info$Condition == "PolyIC")
control.datapoints <- intersect(control.datapoints, which(dpc == x))
polyic.datapoints <- intersect(polyic.datapoints, which(dpc == x))
use.cols <- c(control.datapoints, polyic.datapoints)
test.dpc <- dpc[use.cols]
test.sex.by.rna <- sex.by.rna[use.cols]
test.response <- response[use.cols]
test.rRNA <- as.numeric(rRNA)[use.cols]
test.group <- group[use.cols]
test.lane <- as.numeric(lane)[use.cols]
test.data <- exp.data[,use.cols]
min.cpm.criteria <- 0.1
min.cpm <- min.cpm.criteria
design <- model.matrix(~as.factor(test.lane) + as.numeric(test.group))
y <- DGEList(counts=test.data, group=group[use.cols])
keep <- rowSums(cpm(y)>min.cpm) >=2 #keeps only genes expressed in above min.cpm in at least 2 libraries in each group
y <- y[keep, , keep.lib.sizes=FALSE]
y <- estimateGLMCommonDisp(y,design)
y <- estimateGLMTrendedDisp(y,design)
y <- estimateGLMTagwiseDisp(y,design)
fit <- glmFit(y,design)
lrt <- glmLRT(fit) # Genewise Negative Binomial Generalized Linear Models.
glm.output <- topTags(lrt, n=Inf)
glm.output.full <- glm.output$table
glm.output.full$gene_name <- rownames(glm.output.full)
rownames(glm.output.full) <- NULL
glm.output.full <- glm.output.full[,c(6,1:5)]
glm.output.full
}
E12.5_GLM_no_sex <- single_timepoint_glm_function_no_sex(12.5)
E14.5_GLM_no_sex <- single_timepoint_glm_function_no_sex(14.5)
E17.5_GLM_no_sex <- single_timepoint_glm_function_no_sex(17.5)
E19.5_GLM_no_sex <- single_timepoint_glm_function_no_sex(19.5)
DE_model_wo_sex_covariate <- DE_genes_barplot_2(E12.5_GLM_no_sex,
E14.5_GLM_no_sex,
E17.5_GLM_no_sex,
E19.5_GLM_no_sex, "DE model without sex covariate")
###################
single_timepoint_glm_sex_variable <- function(x){
control.datapoints <- which(sample.info$Condition == "Saline")
polyic.datapoints <- which(sample.info$Condition == "PolyIC")
control.datapoints <- intersect(control.datapoints, which(dpc == x))
polyic.datapoints <- intersect(polyic.datapoints, which(dpc == x))
use.cols <- c(control.datapoints, polyic.datapoints)
test.dpc <- dpc[use.cols]
test.sex.by.rna <- sex.by.rna[use.cols]
test.response <- response[use.cols]
test.rRNA <- as.numeric(rRNA)[use.cols]
test.group <- group[use.cols]
test.lane <- as.numeric(lane)[use.cols]
test.data <- exp.data[,use.cols]
min.cpm.criteria <- 0.1
min.cpm <- min.cpm.criteria
design <- model.matrix(~as.factor(test.lane) + as.numeric(test.group) + as.factor(ifelse(test.sex.by.rna == "M", 1, 2)))
y <- DGEList(counts=test.data, group = as.factor(test.sex.by.rna))
keep <- rowSums(cpm(y)>min.cpm) >=2 #keeps only genes expressed in above min.cpm in at least 2 libraries in each group
y <- y[keep, , keep.lib.sizes=FALSE]
y <- estimateGLMCommonDisp(y,design)
y <- estimateGLMTrendedDisp(y,design)
y <- estimateGLMTagwiseDisp(y,design)
fit <- glmFit(y,design)
lrt <- glmLRT(fit) # Genewise Negative Binomial Generalized Linear Models.
glm.output <- topTags(lrt, n=Inf)
glm.output.full <- glm.output$table
glm.output.full$gene_name <- rownames(glm.output.full)
rownames(glm.output.full) <- NULL
glm.output.full <- glm.output.full[,c(6,1:5)]
glm.output.full
}
E12.5_GLM_sex_as_test_variable <- single_timepoint_glm_sex_variable(12.5)
E14.5_GLM_sex_as_test_variable <- single_timepoint_glm_sex_variable(14.5)
E17.5_GLM_sex_as_test_variable <- single_timepoint_glm_sex_variable(17.5)
E19.5_GLM_sex_as_test_variable <- single_timepoint_glm_sex_variable(19.5)
DE_model_sex_as_test_variable <- DE_genes_barplot_2(E12.5_GLM_sex_as_test_variable,
E14.5_GLM_sex_as_test_variable,
E17.5_GLM_sex_as_test_variable,
E19.5_GLM_sex_as_test_variable, "DE model with sex \n as the experimental variable. \n Male vs Female")
grid.arrange(DE_model_wo_sex_covariate, DE_model_sex_as_test_variable, ncol = 2)
Supplementary Figure 2ab. Males and females show largely concordant DE signatures following MIA. (left) Numbers of upregulated and downregulated DE genes with P < 0.05 and FDR < 0.05, in the DE analysis lacking the sex covariate, are similar to the DE model including the sex covariate (Figure 2c). (right) Numbers of upregulated and downregulated DE genes with P < 0.05 and FDR < 0.05, in a DE analysis testing for differential expression between sexes identifies few DE genes, suggesting limited sex dimorphism between samples.
male_vs_female_DE <- function(x, y, z, title){
#x <- E17.5_GLM_male
#y <- E17.5_GLM_female
#z <- E17.5_GLM
a <- dplyr::select(x, gene_name, logFC, PValue, FDR)
b <- dplyr::select(y, gene_name, logFC, PValue, FDR)
colnames(a) <- c("gene_name", paste0(colnames(a)[2:4], "_", "male"))
colnames(b) <- c("gene_name", paste0(colnames(b)[2:4], "_", "female"))
df <- merge(a, b, by ="gene_name", all=T)
FDR_geneset <- dplyr::filter(z, FDR < 0.05)$gene_name
df <- dplyr::filter(df, gene_name %in% FDR_geneset)
df$Sig <- ifelse(df$FDR_male < 0.05, "Males", "Only in DE model including both sexes")
df$Sig <- ifelse(df$FDR_female < 0.05, "Females", df$Sig)
df$Sig <- ifelse(df$FDR_female < 0.05 & df$FDR_male < 0.05, "Independently in both sexes", df$Sig)
#Capping at -4 and +4
df$logFC_male <- ifelse(df$logFC_male > 4, 4, df$logFC_male)
df$logFC_male <- ifelse(df$logFC_male < -4, -4, df$logFC_male)
df$logFC_female <- ifelse(df$logFC_female > 4, 4, df$logFC_female)
df$logFC_female <- ifelse(df$logFC_female < -4, -4, df$logFC_female)
ggplot()+
geom_abline(intercept = 0, slope =1)+
geom_point(data = filter(df, Sig == "Only in DE model including both sexes"),
aes(x = logFC_male, y = logFC_female, color = Sig, label = gene_name), alpha = 0.4, size = 2)+
geom_point(data = filter(df, Sig == "Females"),
aes(x = logFC_male, y = logFC_female, color = Sig, label = gene_name), alpha = 0.4, size = 2)+
geom_point(data = filter(df, Sig == "Males"),
aes(x = logFC_male, y = logFC_female, color = Sig, label = gene_name), alpha = 0.4, size = 2)+
geom_point(data = filter(df, Sig == "Independently in both sexes"),
aes(x = logFC_male, y = logFC_female, color = Sig, label = gene_name), alpha = 0.4, size = 2)+
theme_bw()+
coord_cartesian(xlim = c(-4, 4), ylim = c(-4, 4))+
labs(x="log2FC male", y="log2FC female", title = title, color = "Genes passing FDR < 0.05 \n in the DE model including both sexes \n also DE with FDR < 0.05 in:")+
theme(plot.title = element_text(hjust = 0.5),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()
)+
scale_color_manual(values = c("Females" = "#619CFF",
"Males" = "#F8766D",
"Independently in both sexes" = "#0e8231",
"Only in DE model including both sexes" = "grey"),
breaks = c("Females",
"Males",
"Independently in both sexes",
"Only in DE model including both sexes"))+
geom_hline(yintercept=0, linetype=2)+
geom_vline(xintercept=0, linetype=2)
}
E12.5_mf <- male_vs_female_DE(E12.5_GLM_male, E12.5_GLM_female, E12.5_GLM, "E12.5")
E14.5_mf <- male_vs_female_DE(E14.5_GLM_male, E14.5_GLM_female, E14.5_GLM, "E14.5")
E17.5_mf <- male_vs_female_DE(E17.5_GLM_male, E17.5_GLM_female, E17.5_GLM, "E17.5")
grid.arrange(E12.5_mf+theme(legend.position = "none"),
E14.5_mf+theme(legend.position = "none"),
E17.5_mf,
ncol = 3, widths = c(1,1,1.8))
Supplementary Figure 3. Sex-stratified comparison of DE genes and effect sizes suggest increased DE effect sizes in females. Correlation of the relative effect sizes (log2FC) for genes passing FDR < 0.05 identifies high degree of concordance between MIA responses between sexes. Genes passing FDR threshold in either or both sexes are color coded.
# save.image("G:/Shared drives/Nord Lab - Computational Projects/MIA/eLIFE_Clean_code_for_GitHub/temp.RData")
# load("G:/Shared drives/Nord Lab - Computational Projects/MIA/eLIFE_Clean_code_for_GitHub/temp.RData")
# BiocManager::install("RBGL") - required by Vennerable
# install.packages("Vennerable", repos="http://R-Forge.R-project.org")
# These plots can be further customized with VennThemes, or in the Illustrator...
library(Vennerable)
# E14.5
DE_model_including_both_sexes <- filter(E14.5_GLM, FDR < 0.05)$gene_name
Female_DE <- filter(E14.5_GLM_female, FDR < 0.05)$gene_name
ll <- list(DE_model_including_both_sexes, Female_DE)
llv <- Venn(ll, SetNames = c("DE_model_including_both_sexes", "Female_DE"))
plot(llv, doWeights = TRUE, type = "circles")
library(Vennerable)
DE_model_including_both_sexes <- filter(E17.5_GLM, FDR < 0.05)$gene_name
Male_DE <- filter(E17.5_GLM_male, FDR < 0.05)$gene_name
Female_DE <- filter(E17.5_GLM_female, FDR < 0.05)$gene_name
ll <- list(DE_model_including_both_sexes, Male_DE, Female_DE)
llv <- Venn(ll, SetNames = c("DE_model_including_both_sexes", "Male_DE", "Female_DE"))
#plot(llv, doWeights = FALSE)
#plot(llv, doWeights = TRUE)
# Note. Editing Vennerable plos is very difficult, especially in Rmd framework. Consider switching to ggVennDiagram.
myplot1 <- function(){plot(llv, doWeights = TRUE)}
myplot2 <- function(){plot(llv, doWeights = FALSE)}
p1 <- grid.grabExpr(myplot1())
p2 <- grid.grabExpr(myplot2())
grid.arrange(p1, p2, ncol=2)
Supplementary Figure 3b. Venn diagrams illustrating overlap between DE gene sets passing FDR < 0.05 at E14.5 and E17.5, in DE models containing samples from both sexes, males, or females. The DE model comprising individuals of both sexes includes most of the DE signature of each sex. Female DE analysis identifies limited and unique DE signature.
# The order of bars is not exactly the same as in the paper. I decided to omit this due to its complication.
rpkm_box_plot_sex_single_dpc <- function(x, y){
# x <- "Vegfa"
# y <- "14.5"
rpkm_test <- as.data.frame(reshape2::melt(rpkm.data_linear[x,]))
rpkm_test_w_info <- cbind(rpkm_test, sample.info[,"ExperimentalDesign"])
rpkm_test_w_info <- cbind(rpkm_test_w_info, as.character(sample.info[,"DPC"]))
rpkm_test_w_info <- cbind(rpkm_test_w_info, as.character(sample.info[,"Condition"]))
rpkm_test_w_info <- cbind(rpkm_test_w_info, as.character(sample.info[,"sex.by.rna"]))
colnames(rpkm_test_w_info) <- c("RPKM", "treatment", "DPC", "Condition", "Sex")
rpkm_test_w_info$Sex_Condition <- as.character(paste0(rpkm_test_w_info$Sex, "_", rpkm_test_w_info$Condition))
rpkm_test_w_info$Sex_Condition_DPC <- as.character(paste0(rpkm_test_w_info$Sex_Condition, "_", rpkm_test_w_info$DPC))
rpkm_test_w_info$Sex <- ifelse(rpkm_test_w_info$Sex == "F", "Female", "Male")
j_brew_colors <- brewer.pal(n = 8, name = "Paired")[c(6,2)]
rpkm_test_w_info <- filter(rpkm_test_w_info, DPC == y)
ggplot(rpkm_test_w_info, aes(x = x, y= RPKM, colour=Condition, shape = Sex, group = Sex_Condition_DPC))+
geom_jitter(size=2, alpha = 0.5, position = position_jitterdodge(jitter.width = 0.05))+
geom_boxplot(alpha=0, position = position_dodge(width=0.75))+
theme_bw()+
theme(axis.text.x=element_text(size=12, face="bold"))+
labs(title= x, x="")+
theme(plot.title = element_text(size = rel(2), hjust=0.5),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
legend.position = "none")+
scale_color_manual(values = j_brew_colors)
}
# Gene selection criteria
# FDR < 0.5 and logFC > 0.5 or < -0.5 in females
# and not logFC > 0.5 or < -0.5 in males.
f <- filter(E14.5_GLM_female, FDR < 0.05, logFC > 0.5 | logFC < -0.5)$gene_name #238 genes
m <- filter(E14.5_GLM_male, logFC > 0.5 | logFC < -0.5)$gene_name #2593 genes, FDR criteria removed because of 0 genes passing threshold.
f_but_not_m <- setdiff(f, m)
genes <- filter(E14.5_GLM_female, gene_name %in% f_but_not_m)$gene_name
pl <- lapply(genes[1:12], function(x) rpkm_box_plot_sex_single_dpc(x, "14.5"))
marrangeGrob(list(pl[[1]],pl[[2]], pl[[3]], pl[[4]], pl[[5]],pl[[6]],
pl[[7]], pl[[8]], pl[[9]], pl[[10]], pl[[11]],
pl[[12]]+theme(legend.position = "right")),
nrow=2, ncol=6, top = "", layout_matrix = rbind(c(1:6), c(7:12)))
f <- filter(E17.5_GLM_female, FDR < 0.05, logFC > 0.5 | logFC < -0.5)$gene_name #3037 genes
m <- filter(E17.5_GLM_male, FDR < 0.05, logFC > 0.5 | logFC < -0.5)$gene_name #1048 genes,
f_but_not_m <- setdiff(f, m)
genes <- filter(E17.5_GLM_female, gene_name %in% f_but_not_m)$gene_name
pl <- lapply(genes[1:12], function(x) rpkm_box_plot_sex_single_dpc(x, "17.5"))
marrangeGrob(list(pl[[1]],pl[[2]], pl[[3]], pl[[4]], pl[[5]],pl[[6]],
pl[[7]], pl[[8]], pl[[9]], pl[[10]], pl[[11]],
pl[[12]]+theme(legend.position = "right")),
nrow=2, ncol=6, top = "", layout_matrix = rbind(c(1:6), c(7:12)))
Supplementary Figure 3c. Examples of E14.5 and E17.5 genes significant in females but not males. At E14.5 genes were selected to pass FDR < 0.05 and logFC > 0.5 or logFC < -0.5 in females, but not logFC > 0.5 or logFC < -0.5 in males. The gene-set selection process was analogous for E17.5 with the addition of FDR < 0.05 condition in males. After gene set filtering, twelve genes with the lowest FDR in females were plotted.
rpkm_box_plot_sex <- function(x){
#x <- "Vegfa"
rpkm_test <- as.data.frame(reshape2::melt(rpkm.data_linear[x,]))
rpkm_test_w_info <- cbind(rpkm_test, sample.info[,"ExperimentalDesign"])
rpkm_test_w_info <- cbind(rpkm_test_w_info, as.character(sample.info[,"DPC"]))
rpkm_test_w_info <- cbind(rpkm_test_w_info, as.character(sample.info[,"Condition"]))
rpkm_test_w_info <- cbind(rpkm_test_w_info, as.character(sample.info[,"sex.by.rna"]))
colnames(rpkm_test_w_info) <- c("RPKM", "treatment", "DPC", "Condition", "Sex")
rpkm_test_w_info$Sex_Condition <- as.character(paste0(rpkm_test_w_info$Sex_by_RNA, "_", rpkm_test_w_info$Condition))
rpkm_test_w_info$Sex_Condition_DPC <- as.character(paste0(rpkm_test_w_info$Sex_Condition, "_", rpkm_test_w_info$DPC))
head(rpkm_test_w_info)
j_brew_colors <- brewer.pal(n = 8, name = "Paired")[c(6,2)]
ggplot(rpkm_test_w_info, aes(x = DPC, y= RPKM, colour=Condition, shape = Sex, group = Sex_Condition))+
geom_jitter(size=2, alpha = 0.5, position = position_jitterdodge(jitter.width = 0.2))+
geom_boxplot(alpha=0, position = position_dodge(width=0.75), aes(group = Sex_Condition_DPC))+
theme_bw()+
theme(axis.text.x=element_text(size=12, face="bold"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())+
labs(title= x, x="")+
theme(plot.title = element_text(size = rel(2), hjust=0.5))+
scale_color_manual(values = j_brew_colors)+
scale_fill_manual(values = j_brew_colors)+
scale_x_discrete(breaks =c("12.5", "14.5", "17.5", "19.5"), labels = c("E12.5", "E14.5", "E17.5", "P0"))
}
# S6a
x <- c("Vegfa", "Pdk1", "Bnip3", "Flt1", "Ldha", "Il20rb")
pl <- lapply(x, function(x) FUN={
rpkm_box_plot_sex(x)+theme(legend.position = "none")
})
marrangeGrob(pl, nrow = 3, ncol=2, top = "")
# S6b
### ###
x <- c("Mki67", "Pax6", "Tbr1", "Satb2", "Eomes", "Sox9", "Bcl11b", "Cux1")
pl <- lapply(x, function(x) FUN={
rpkm_box_plot_sex(x)+theme(legend.position = "none")
})
marrangeGrob(pl, nrow = 4, ncol=2, top = "")
# S6c
x <- c("Gfap", "Olig2", "Dlx2", "Gad1")
pl <- lapply(x, function(x) FUN={
rpkm_box_plot_sex(x)+theme(legend.position = "none")
})
marrangeGrob(pl, nrow = 2, ncol=2, top = "")
Supplementary Figure 6. Sex-stratified RPKM gene trajectories of genes presented in the manuscript showing concordant effects between males and females. (9.4.1) Vegfa, Flt1, Pdk1, Ldha, Bnip3, Il20rb. (9.4.2) Mki67, Eomes, Pax6, Sox9, Tbr1, Bcl11b, Satb2, Cux1. (9.4.3) Gfap, Dlx2, Olig2, Gad1.
library(pander)
pander(sessionInfo())
R version 4.0.2 (2020-06-22)
Platform: x86_64-w64-mingw32/x64 (64-bit)
locale: LC_COLLATE=English_United States.1252, LC_CTYPE=English_United States.1252, LC_MONETARY=English_United States.1252, LC_NUMERIC=C and LC_TIME=English_United States.1252
attached base packages: grid, stats4, parallel, stats, graphics, grDevices, utils, datasets, methods and base
other attached packages: pander(v.0.6.3), Vennerable(v.3.0), xtable(v.1.8-4), gtools(v.3.8.2), reshape(v.0.8.8), RBGL(v.1.64.0), graph(v.1.66.0), biomaRt(v.2.44.4), Hmisc(v.4.4-2), Formula(v.1.2-4), survival(v.3.1-12), lattice(v.0.20-41), plotly(v.4.9.2.2), plyr(v.1.8.6), gridExtra(v.2.3), ggrepel(v.0.8.2), knitr(v.1.30), RColorBrewer(v.1.1-2), DT(v.0.16), data.table(v.1.12.8), dplyr(v.1.0.0), reshape2(v.1.4.4), ggplot2(v.3.3.2), mclust(v.5.4.7), GenomicFeatures(v.1.40.1), AnnotationDbi(v.1.50.3), Biobase(v.2.48.0), GenomicRanges(v.1.40.0), GenomeInfoDb(v.1.24.2), IRanges(v.2.22.2), S4Vectors(v.0.26.1), BiocGenerics(v.0.34.0), edgeR(v.3.30.3), limma(v.3.44.3), pheatmap(v.1.0.12), RUVnormalize(v.1.22.0), sva(v.3.36.0), BiocParallel(v.1.22.0), genefilter(v.1.70.0), mgcv(v.1.8-31) and nlme(v.3.1-148)
loaded via a namespace (and not attached): colorspace(v.1.4-1), ellipsis(v.0.3.1), htmlTable(v.2.1.0), XVector(v.0.28.0), base64enc(v.0.1-3), rstudioapi(v.0.13), farver(v.2.0.3), bit64(v.4.0.5), xml2(v.1.3.2), codetools(v.0.2-16), splines(v.4.0.2), jsonlite(v.1.7.2), Cairo(v.1.5-12.2), Rsamtools(v.2.4.0), annotate(v.1.66.0), cluster(v.2.1.0), dbplyr(v.2.0.0), png(v.0.1-7), compiler(v.4.0.2), httr(v.1.4.2), backports(v.1.1.7), assertthat(v.0.2.1), Matrix(v.1.2-18), lazyeval(v.0.2.2), htmltools(v.0.5.0), prettyunits(v.1.1.1), tools(v.4.0.2), gtable(v.0.3.0), glue(v.1.4.1), GenomeInfoDbData(v.1.2.3), rappdirs(v.0.3.1), Rcpp(v.1.0.6), vctrs(v.0.3.0), Biostrings(v.2.56.0), rtracklayer(v.1.48.0), crosstalk(v.1.1.0.1), xfun(v.0.15), stringr(v.1.4.0), rvest(v.0.3.6), lifecycle(v.0.2.0), XML(v.3.99-0.3), zlibbioc(v.1.34.0), scales(v.1.1.1), hms(v.0.5.3), SummarizedExperiment(v.1.18.2), yaml(v.2.2.1), curl(v.4.3), memoise(v.1.1.0), rpart(v.4.1-15), latticeExtra(v.0.6-29), stringi(v.1.4.6), RSQLite(v.2.2.1), highr(v.0.8), checkmate(v.2.0.0), RUVnormalizeData(v.1.8.0), rlang(v.0.4.9), pkgconfig(v.2.0.3), matrixStats(v.0.57.0), bitops(v.1.0-6), evaluate(v.0.14), purrr(v.0.3.4), labeling(v.0.4.2), GenomicAlignments(v.1.24.0), htmlwidgets(v.1.5.3), bit(v.4.0.4), tidyselect(v.1.1.0), magrittr(v.2.0.1), R6(v.2.5.0), generics(v.0.1.0), DelayedArray(v.0.14.1), DBI(v.1.1.0), foreign(v.0.8-80), pillar(v.1.4.7), withr(v.2.3.0), nnet(v.7.3-14), RCurl(v.1.98-1.2), tibble(v.3.0.1), crayon(v.1.3.4), BiocFileCache(v.1.12.1), rmarkdown(v.2.6), jpeg(v.0.1-8.1), progress(v.1.2.2), locfit(v.1.5-9.4), blob(v.1.2.1), webshot(v.0.5.2), digest(v.0.6.25), tidyr(v.1.1.0), openssl(v.1.4.3), munsell(v.0.5.0), kableExtra(v.1.3.1), viridisLite(v.0.3.0) and askpass(v.1.1)
# Thw workspace is saved at save.image("G:/Shared drives/Nord Lab - Computational Projects/MIA/Canales_et_al_eLIFE_DE_repo/Canales_eLIFE_2021_DE/Canales_2021_eLIFE_DE_workspace.RData")